{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_table(filename):\n",
    "\n",
    "    if os.path.isfile(filename):\n",
    "        df = pd.read_csv(filename, index_col='date')\n",
    "        return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get this csv file from https://github.com/fja05680/sp500\n",
    "* Historical Lists of S&P 500 components since 1996: S&P 500 Historical Components & Changes(MM-DD-YYYY).csv\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tickers</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>date</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1996-01-02</th>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1996-01-03</th>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1996-01-04</th>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1996-01-10</th>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1996-01-11</th>\n",
       "      <td>AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      tickers\n",
       "date                                                         \n",
       "1996-01-02  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...\n",
       "1996-01-03  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...\n",
       "1996-01-04  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...\n",
       "1996-01-10  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD...\n",
       "1996-01-11  AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,AD..."
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = 'S&P 500 Historical Components & Changes(08-12-2022).csv'\n",
    "df = get_table(filename)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AAL,AAMRQ,AAPL,ABI,ABS,ABT,ABX,ACKH,ACV,ADM,ADP,ADSK,AEE,AEP,AET,AGC,AGN,AHM,AIG,AIT,AL,ALL,AM,AMAT,AMD,AMGN,AMH,AMP,AN,ANDW,ANV,APD,AR,ARC,ARNC,AS,ASC,ASH,AT,ATI,AVP,AVY,AXP,AZA.A,BA,BAC,BAX,BBI,BC,BCO,BCR,BDK,BDX,BEAM,BEV,BF.B,BFI,BFO,BGG,BHGE,BHMSQ,BK,BKB,BLL,BLS,BLY,BMET,BMS,BMY,BNI,BNL,BOAT,BOL,BR,BSX,BT,BUD,C,CA,CAG,CAL,CAR,CAT,CB,CBB,CBE,CBS,CCB,CCI,CCK,CCTYQ,CEG,CEN,CFL,CG,CGP,CHA,CHRS,CI,CIN,CL,CLX,CMA,CMB,CMCSA,CMI,CNG,CNP,CNW,COMS,COP,COST,COV,CPB,CPQ,CR,CRR,CSCO,CSR,CSX,CTB,CTX,CVS,CVX,CYM,CYR,D,DALRQ,DCNAQ,DD,DDS,DE,DEC,DGN,DI,DIGI,DIS,DJ,DLX,DOV,DOW,DRI,DTE,DUK,DWD,DXC,EC,ECH,ECL,ECO,ED,EFU,EIX,EKDKQ,EMN,EMR,ENRNQ,ENS,ETN,ETR,ETS,EXC,F,FBF,FBO,FCN,FCX,FDC,FDX,FE,FG,FJ,FL,FLMIQ,FLTWQ,FMC,FMCC,FNMA,FTL.A,FWLT,G,GAPTQ,GAS,GD,GDW,GE,GFS.A,GIDL,GIS,GLD,GLK,GLW,GP,GPC,GPS,GPU,GR,GRA,GRN,GSX,GT,GTE,GWF,GWW,H,HAL,HAS,HCA,HD,HDLM,HES,HET,HI,HIG,HLT,HM,HNZ,HON,HP,HPC,HPH,HPQ,HRB,HRS,HSH,HSY,HUM,I,IBM,IFF,IKN,INCLF,INGR,INTC,IP,IPG,IR,ITT,ITW,JAVA,JCI,JCP,JH,JNJ,JOS,JP,JPM,JWN,K,KATE,KBH,KEY,KM,KMB,KMG,KO,KR,KRB,KRI,KWP,L,LB,LDG,LDW.B,LLX,LLY,LMT,LNC,LOR,LOW,LPX,LSI,LUB,LUV,M,MAS,MAT,MAY,MCD,MCIC,MCO,MD,MDP,MDR,MDT,MEA,MEE,MEL,MER,MII,MIL,MKG,MMC,MMM,MNR,MO,MOB,MRK,MRO,MS,MSFT,MSI,MST,MTLQQ,MU,MWI,MWV,MYG,MZIAQ,NAE,NAV,NC,NCC,NEE,NEM,NKE,NLC,NMK,NOC,NOVL,NRTLQ,NSC,NSI,NSM,NUE,NWL,NYN,NYT,OAT,OKE,OM,OMX,ONE,ORCL,ORX,OWENQ,OXY,PAC,PAS,PBI,PBY,PCAR,PCG,PCH,PD,PDG,PEG,PEL,PEP,PET,PFE,PG,PGL,PGN,PH,PHA,PHB,PHM,PKI,PLL,PMI,PNC,PNU,PPG,PPL,PPW,PRD,PVN,PX,PZE,R,RAD,RAL,RBD,RBK,RDC,RDS.A,RLM,RML,RNB,ROH,ROK,RRD,RSHCQ,RTN,RYAN,RYC,RYI,S,SAF,SB,SCI,SFA,SFS,SGID,SGP,SHN,SHW,SIAL,SK,SLB,SMI,SMS,SNA,SNT,SO,SPGI,SRR,STI,STJ,STO,SUN,SVU,SWK,SYY,T,TA,TAP,TCOMA,TDM,TEK,TEN,TGNA,TGT,THC,THY,TIN,TJX,TKR,TLAB,TMC,TMC.A,TMK,TNB,TOY,TRB,TRV,TRW,TWX,TX,TXN,TXT,TXU,UAWGQ,UCC,UCL,UCM,UIS,UK,UMG,UN,UNH,UNM,UNP,USB,USBC,USH,USHC,USS,UST,USW,UTX,VAT,VFC,VO,VZ,WAI,WB,WBA,WEN,WFC,WHR,WLA,WLL,WMB,WMT,WMX,WNDXQ,WOR,WWY,WY,WYE,X,XEL,XOM,XRX,YRCW\n"
     ]
    }
   ],
   "source": [
    "print(df.tickers[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tickers</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>date</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2022-06-08</th>\n",
       "      <td>[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2022-06-09</th>\n",
       "      <td>[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2022-06-21</th>\n",
       "      <td>[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2022-06-21</th>\n",
       "      <td>[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2022-06-28</th>\n",
       "      <td>[A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      tickers\n",
       "date                                                         \n",
       "2022-06-08  [A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...\n",
       "2022-06-09  [A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...\n",
       "2022-06-21  [A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...\n",
       "2022-06-21  [A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,...\n",
       "2022-06-28  [A, AAL, AAP, AAPL, ABBV, ABC, ABMD, ABT, ACN,..."
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert ticker column from csv to list, then sort.\n",
    "df['tickers'] = df['tickers'].apply(lambda x: sorted(x.split(',')))\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['AAL', 'AAMRQ', 'AAPL', 'ABI', 'ABS', 'ABT', 'ABX', 'ACKH', 'ACV', 'ADM', 'ADP', 'ADSK', 'AEE', 'AEP', 'AET', 'AGC', 'AGN', 'AHM', 'AIG', 'AIT', 'AL', 'ALL', 'AM', 'AMAT', 'AMD', 'AMGN', 'AMH', 'AMP', 'AN', 'ANDW', 'ANV', 'APD', 'AR', 'ARC', 'ARNC', 'AS', 'ASC', 'ASH', 'AT', 'ATI', 'AVP', 'AVY', 'AXP', 'AZA.A', 'BA', 'BAC', 'BAX', 'BBI', 'BC', 'BCO', 'BCR', 'BDK', 'BDX', 'BEAM', 'BEV', 'BF.B', 'BFI', 'BFO', 'BGG', 'BHGE', 'BHMSQ', 'BK', 'BKB', 'BLL', 'BLS', 'BLY', 'BMET', 'BMS', 'BMY', 'BNI', 'BNL', 'BOAT', 'BOL', 'BR', 'BSX', 'BT', 'BUD', 'C', 'CA', 'CAG', 'CAL', 'CAR', 'CAT', 'CB', 'CBB', 'CBE', 'CBS', 'CCB', 'CCI', 'CCK', 'CCTYQ', 'CEG', 'CEN', 'CFL', 'CG', 'CGP', 'CHA', 'CHRS', 'CI', 'CIN', 'CL', 'CLX', 'CMA', 'CMB', 'CMCSA', 'CMI', 'CNG', 'CNP', 'CNW', 'COMS', 'COP', 'COST', 'COV', 'CPB', 'CPQ', 'CR', 'CRR', 'CSCO', 'CSR', 'CSX', 'CTB', 'CTX', 'CVS', 'CVX', 'CYM', 'CYR', 'D', 'DALRQ', 'DCNAQ', 'DD', 'DDS', 'DE', 'DEC', 'DGN', 'DI', 'DIGI', 'DIS', 'DJ', 'DLX', 'DOV', 'DOW', 'DRI', 'DTE', 'DUK', 'DWD', 'DXC', 'EC', 'ECH', 'ECL', 'ECO', 'ED', 'EFU', 'EIX', 'EKDKQ', 'EMN', 'EMR', 'ENRNQ', 'ENS', 'ETN', 'ETR', 'ETS', 'EXC', 'F', 'FBF', 'FBO', 'FCN', 'FCX', 'FDC', 'FDX', 'FE', 'FG', 'FJ', 'FL', 'FLMIQ', 'FLTWQ', 'FMC', 'FMCC', 'FNMA', 'FTL.A', 'FWLT', 'G', 'GAPTQ', 'GAS', 'GD', 'GDW', 'GE', 'GFS.A', 'GIDL', 'GIS', 'GLD', 'GLK', 'GLW', 'GP', 'GPC', 'GPS', 'GPU', 'GR', 'GRA', 'GRN', 'GSX', 'GT', 'GTE', 'GWF', 'GWW', 'H', 'HAL', 'HAS', 'HCA', 'HD', 'HDLM', 'HES', 'HET', 'HI', 'HIG', 'HLT', 'HM', 'HNZ', 'HON', 'HP', 'HPC', 'HPH', 'HPQ', 'HRB', 'HRS', 'HSH', 'HSY', 'HUM', 'I', 'IBM', 'IFF', 'IKN', 'INCLF', 'INGR', 'INTC', 'IP', 'IPG', 'IR', 'ITT', 'ITW', 'JAVA', 'JCI', 'JCP', 'JH', 'JNJ', 'JOS', 'JP', 'JPM', 'JWN', 'K', 'KATE', 'KBH', 'KEY', 'KM', 'KMB', 'KMG', 'KO', 'KR', 'KRB', 'KRI', 'KWP', 'L', 'LB', 'LDG', 'LDW.B', 'LLX', 'LLY', 'LMT', 'LNC', 'LOR', 'LOW', 'LPX', 'LSI', 'LUB', 'LUV', 'M', 'MAS', 'MAT', 'MAY', 'MCD', 'MCIC', 'MCO', 'MD', 'MDP', 'MDR', 'MDT', 'MEA', 'MEE', 'MEL', 'MER', 'MII', 'MIL', 'MKG', 'MMC', 'MMM', 'MNR', 'MO', 'MOB', 'MRK', 'MRO', 'MS', 'MSFT', 'MSI', 'MST', 'MTLQQ', 'MU', 'MWI', 'MWV', 'MYG', 'MZIAQ', 'NAE', 'NAV', 'NC', 'NCC', 'NEE', 'NEM', 'NKE', 'NLC', 'NMK', 'NOC', 'NOVL', 'NRTLQ', 'NSC', 'NSI', 'NSM', 'NUE', 'NWL', 'NYN', 'NYT', 'OAT', 'OKE', 'OM', 'OMX', 'ONE', 'ORCL', 'ORX', 'OWENQ', 'OXY', 'PAC', 'PAS', 'PBI', 'PBY', 'PCAR', 'PCG', 'PCH', 'PD', 'PDG', 'PEG', 'PEL', 'PEP', 'PET', 'PFE', 'PG', 'PGL', 'PGN', 'PH', 'PHA', 'PHB', 'PHM', 'PKI', 'PLL', 'PMI', 'PNC', 'PNU', 'PPG', 'PPL', 'PPW', 'PRD', 'PVN', 'PX', 'PZE', 'R', 'RAD', 'RAL', 'RBD', 'RBK', 'RDC', 'RDS.A', 'RLM', 'RML', 'RNB', 'ROH', 'ROK', 'RRD', 'RSHCQ', 'RTN', 'RYAN', 'RYC', 'RYI', 'S', 'SAF', 'SB', 'SCI', 'SFA', 'SFS', 'SGID', 'SGP', 'SHN', 'SHW', 'SIAL', 'SK', 'SLB', 'SMI', 'SMS', 'SNA', 'SNT', 'SO', 'SPGI', 'SRR', 'STI', 'STJ', 'STO', 'SUN', 'SVU', 'SWK', 'SYY', 'T', 'TA', 'TAP', 'TCOMA', 'TDM', 'TEK', 'TEN', 'TGNA', 'TGT', 'THC', 'THY', 'TIN', 'TJX', 'TKR', 'TLAB', 'TMC', 'TMC.A', 'TMK', 'TNB', 'TOY', 'TRB', 'TRV', 'TRW', 'TWX', 'TX', 'TXN', 'TXT', 'TXU', 'UAWGQ', 'UCC', 'UCL', 'UCM', 'UIS', 'UK', 'UMG', 'UN', 'UNH', 'UNM', 'UNP', 'USB', 'USBC', 'USH', 'USHC', 'USS', 'UST', 'USW', 'UTX', 'VAT', 'VFC', 'VO', 'VZ', 'WAI', 'WB', 'WBA', 'WEN', 'WFC', 'WHR', 'WLA', 'WLL', 'WMB', 'WMT', 'WMX', 'WNDXQ', 'WOR', 'WWY', 'WY', 'WYE', 'X', 'XEL', 'XOM', 'XRX', 'YRCW']\n"
     ]
    }
   ],
   "source": [
    "print(df.tickers[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tickers</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>date</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1996-01-02</th>\n",
       "      <td>[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1996-01-03</th>\n",
       "      <td>[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1996-01-04</th>\n",
       "      <td>[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1996-01-10</th>\n",
       "      <td>[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1996-01-11</th>\n",
       "      <td>[AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      tickers\n",
       "date                                                         \n",
       "1996-01-02  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...\n",
       "1996-01-03  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...\n",
       "1996-01-04  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...\n",
       "1996-01-10  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...\n",
       "1996-01-11  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A..."
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Replace SYMBOL-yyyymm with SYMBOL.\n",
    "df['tickers'] = [[ticker.split('-')[0] for ticker in tickers] for tickers in df['tickers']]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['AAL', 'AAMRQ', 'AAPL', 'ABI', 'ABS', 'ABT', 'ABX', 'ACKH', 'ACV', 'ADM', 'ADP', 'ADSK', 'AEE', 'AEP', 'AET', 'AGC', 'AGN', 'AHM', 'AIG', 'AIT', 'AL', 'ALL', 'AM', 'AMAT', 'AMD', 'AMGN', 'AMH', 'AMP', 'AN', 'ANDW', 'ANV', 'APD', 'AR', 'ARC', 'ARNC', 'AS', 'ASC', 'ASH', 'AT', 'ATI', 'AVP', 'AVY', 'AXP', 'AZA.A', 'BA', 'BAC', 'BAX', 'BBI', 'BC', 'BCO', 'BCR', 'BDK', 'BDX', 'BEAM', 'BEV', 'BF.B', 'BFI', 'BFO', 'BGG', 'BHGE', 'BHMSQ', 'BK', 'BKB', 'BLL', 'BLS', 'BLY', 'BMET', 'BMS', 'BMY', 'BNI', 'BNL', 'BOAT', 'BOL', 'BR', 'BSX', 'BT', 'BUD', 'C', 'CA', 'CAG', 'CAL', 'CAR', 'CAT', 'CB', 'CBB', 'CBE', 'CBS', 'CCB', 'CCI', 'CCK', 'CCTYQ', 'CEG', 'CEN', 'CFL', 'CG', 'CGP', 'CHA', 'CHRS', 'CI', 'CIN', 'CL', 'CLX', 'CMA', 'CMB', 'CMCSA', 'CMI', 'CNG', 'CNP', 'CNW', 'COMS', 'COP', 'COST', 'COV', 'CPB', 'CPQ', 'CR', 'CRR', 'CSCO', 'CSR', 'CSX', 'CTB', 'CTX', 'CVS', 'CVX', 'CYM', 'CYR', 'D', 'DALRQ', 'DCNAQ', 'DD', 'DDS', 'DE', 'DEC', 'DGN', 'DI', 'DIGI', 'DIS', 'DJ', 'DLX', 'DOV', 'DOW', 'DRI', 'DTE', 'DUK', 'DWD', 'DXC', 'EC', 'ECH', 'ECL', 'ECO', 'ED', 'EFU', 'EIX', 'EKDKQ', 'EMN', 'EMR', 'ENRNQ', 'ENS', 'ETN', 'ETR', 'ETS', 'EXC', 'F', 'FBF', 'FBO', 'FCN', 'FCX', 'FDC', 'FDX', 'FE', 'FG', 'FJ', 'FL', 'FLMIQ', 'FLTWQ', 'FMC', 'FMCC', 'FNMA', 'FTL.A', 'FWLT', 'G', 'GAPTQ', 'GAS', 'GD', 'GDW', 'GE', 'GFS.A', 'GIDL', 'GIS', 'GLD', 'GLK', 'GLW', 'GP', 'GPC', 'GPS', 'GPU', 'GR', 'GRA', 'GRN', 'GSX', 'GT', 'GTE', 'GWF', 'GWW', 'H', 'HAL', 'HAS', 'HCA', 'HD', 'HDLM', 'HES', 'HET', 'HI', 'HIG', 'HLT', 'HM', 'HNZ', 'HON', 'HP', 'HPC', 'HPH', 'HPQ', 'HRB', 'HRS', 'HSH', 'HSY', 'HUM', 'I', 'IBM', 'IFF', 'IKN', 'INCLF', 'INGR', 'INTC', 'IP', 'IPG', 'IR', 'ITT', 'ITW', 'JAVA', 'JCI', 'JCP', 'JH', 'JNJ', 'JOS', 'JP', 'JPM', 'JWN', 'K', 'KATE', 'KBH', 'KEY', 'KM', 'KMB', 'KMG', 'KO', 'KR', 'KRB', 'KRI', 'KWP', 'L', 'LB', 'LDG', 'LDW.B', 'LLX', 'LLY', 'LMT', 'LNC', 'LOR', 'LOW', 'LPX', 'LSI', 'LUB', 'LUV', 'M', 'MAS', 'MAT', 'MAY', 'MCD', 'MCIC', 'MCO', 'MD', 'MDP', 'MDR', 'MDT', 'MEA', 'MEE', 'MEL', 'MER', 'MII', 'MIL', 'MKG', 'MMC', 'MMM', 'MNR', 'MO', 'MOB', 'MRK', 'MRO', 'MS', 'MSFT', 'MSI', 'MST', 'MTLQQ', 'MU', 'MWI', 'MWV', 'MYG', 'MZIAQ', 'NAE', 'NAV', 'NC', 'NCC', 'NEE', 'NEM', 'NKE', 'NLC', 'NMK', 'NOC', 'NOVL', 'NRTLQ', 'NSC', 'NSI', 'NSM', 'NUE', 'NWL', 'NYN', 'NYT', 'OAT', 'OKE', 'OM', 'OMX', 'ONE', 'ORCL', 'ORX', 'OWENQ', 'OXY', 'PAC', 'PAS', 'PBI', 'PBY', 'PCAR', 'PCG', 'PCH', 'PD', 'PDG', 'PEG', 'PEL', 'PEP', 'PET', 'PFE', 'PG', 'PGL', 'PGN', 'PH', 'PHA', 'PHB', 'PHM', 'PKI', 'PLL', 'PMI', 'PNC', 'PNU', 'PPG', 'PPL', 'PPW', 'PRD', 'PVN', 'PX', 'PZE', 'R', 'RAD', 'RAL', 'RBD', 'RBK', 'RDC', 'RDS.A', 'RLM', 'RML', 'RNB', 'ROH', 'ROK', 'RRD', 'RSHCQ', 'RTN', 'RYAN', 'RYC', 'RYI', 'S', 'SAF', 'SB', 'SCI', 'SFA', 'SFS', 'SGID', 'SGP', 'SHN', 'SHW', 'SIAL', 'SK', 'SLB', 'SMI', 'SMS', 'SNA', 'SNT', 'SO', 'SPGI', 'SRR', 'STI', 'STJ', 'STO', 'SUN', 'SVU', 'SWK', 'SYY', 'T', 'TA', 'TAP', 'TCOMA', 'TDM', 'TEK', 'TEN', 'TGNA', 'TGT', 'THC', 'THY', 'TIN', 'TJX', 'TKR', 'TLAB', 'TMC', 'TMC.A', 'TMK', 'TNB', 'TOY', 'TRB', 'TRV', 'TRW', 'TWX', 'TX', 'TXN', 'TXT', 'TXU', 'UAWGQ', 'UCC', 'UCL', 'UCM', 'UIS', 'UK', 'UMG', 'UN', 'UNH', 'UNM', 'UNP', 'USB', 'USBC', 'USH', 'USHC', 'USS', 'UST', 'USW', 'UTX', 'VAT', 'VFC', 'VO', 'VZ', 'WAI', 'WB', 'WBA', 'WEN', 'WFC', 'WHR', 'WLA', 'WLL', 'WMB', 'WMT', 'WMX', 'WNDXQ', 'WOR', 'WWY', 'WY', 'WYE', 'X', 'XEL', 'XOM', 'XRX', 'YRCW']\n"
     ]
    }
   ],
   "source": [
    "print(df.tickers[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2656"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#check total length\n",
    "df['tickers'].shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1996-01-02'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#start date\n",
    "df.index[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2022-06-28'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#end date\n",
    "df.index[df['tickers'].shape[0]-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#merge all tickers together into a list\n",
    "tickers_list=[]\n",
    "for i in range(df['tickers'].shape[0]):\n",
    "    tickers_list = tickers_list + df['tickers'][i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1319058"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tickers_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#get unique tickers\n",
    "tickers_list_set=list(sorted(set(tickers_list)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1132"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#check total unique tickers\n",
    "len(tickers_list_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "tickers_list_set=pd.DataFrame(tickers_list_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1132, 1)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tickers_list_set.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "#filename = 'tickers_list_set.csv'\n",
    "#tickers_list_set.to_csv(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#output to a txt file in order to query from WRDS\n",
    "tickers_list_set.to_csv('sp500_tickers.txt', header=None, index=None, sep=' ', mode='a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
